***merge and clean data files***

*merge the seven original data files of each year on city_id and save the result as data_<year>.dta*
*a source file is erased only if it was merged successfully AND the combined file of that year was saved, so a failed step can no longer destroy raw data*
forval yr = 2003/2010 {
    clear all
    use "`yr'_1.dta", clear
    * indices of the source files whose contents are in the dataset in memory
    local merged 1
    forval i = 2/7 {
        cap merge 1:1 city_id using "`yr'_`i'.dta", force
        * remember the return code before the next captured command overwrites _rc
        local merge_rc = _rc
        cap drop _merge
        if `merge_rc' == 0 {
            local merged `merged' `i'
        }
        else {
            * still best-effort (the loop continues), but the problem is reported and the file stays on disk
            display as error "merge of `yr'_`i'.dta failed (r(`merge_rc')); file kept on disk"
        }
    }
    * captured as in the original: gen fails (and is skipped) when a variable named year already exists
    cap gen year = `yr'
    cap saveold "data_`yr'.dta", replace
    local save_rc = _rc
    if `save_rc' == 0 {
        foreach i of local merged {
            cap erase "`yr'_`i'.dta"
        }
    }
    else {
        display as error "could not save data_`yr'.dta (r(`save_rc')); source files of `yr' kept on disk"
    }
}

*stack the yearly files into one dataset and delete each yearly file once it has been appended*
*the dataset in memory at this point is the 2010 one left by the merge loop, so only 2003-2009 are read from disk and data_2010.dta is simply removed*
foreach yr of numlist 2003/2009 {
    local yearfile "data_`yr'.dta"
    append using "`yearfile'", force
    capture erase "`yearfile'"
}
capture erase "data_2010.dta"

*drop unwanted variables in this compiled dataset*
*E-I and L-J are varlist ranges; Stata resolves them by the order of the variables in the dataset, not alphabetically*
*NOTE(review): L-J is only a valid range if L is stored before J in the dataset - TODO confirm*
*NOTE(review): because of "cap", any error from this command (for example a listed variable that does not exist) is suppressed silently; presumably nothing is dropped in that case, so verify afterwards that the variables are really gone*
cap drop A D E-I K L-J BW BY

*remove the overall national level row (city_id "0") and rows whose city_id is empty or "."*
drop if inlist(city_id, "", ".", "0")

*remove all provincial level rows (city_id ends with "0000") but keep the four municipalities, whose city_id also ends with "0000"*
*temp holds city_id from the third character onwards; it is not dropped here because the administrative level section further down reads it again*
gen temp = substr(city_id, 3, .)
gen byte flag = temp == "0000" & !inlist(city_name, "Beijing", "Tianjin", "Shanghai", "Chongqing")
drop if flag
drop flag

sort year city_id

*convert nominal values to real values with the World Bank GDP deflator for China (base year 2000)*
*deflators lists one divisor per year, in order, starting with 2003 and ending with 2010*
local nominal gdp gdp_pc ind_val ind_val_domestic ind_val_hmt ind_val_fie fdi_val ind_fa
local deflators 1.0532 1.1262 1.1703 1.2149 1.3077 1.4092 1.4007 1.4936
foreach v of local nominal {
    * start from missing so that observations outside 2003-2010 stay missing
    gen `v'_real = .
    local yr = 2003
    foreach d of local deflators {
        replace `v'_real = `v'/`d' if year == `yr'
        local ++yr
    }
}

*additional control variables used in this study*
*share of total industrial output by ownership type, in percent*
*domestic is not used in regression and is included only for comparison with the HMT share (hmt) and the foreign share (fie)*
foreach own in domestic hmt fie {
    gen ind_val_`own'_per = ind_val_`own'/ind_val*100
}
*capital to labor ratio: real ind_fa divided by ind_emp*
gen kl_ratio_real = ind_fa_real/ind_emp
*share of dirty sector employment (emp_mine, emp_manu and emp_utility) in emp, in percent*
gen emp_dirty_per = (emp_mine+emp_manu+emp_utility)/emp*100

*natural log of each listed variable, stored as ln<name>*
local logvars so2_emi gdp_real gdp_pc_real ind_val_real ind_val_domestic_real ind_val_hmt_real ind_val_fie_real ind_fa_real kl_ratio_real
foreach v of local logvars {
    gen ln`v' = ln(`v')
}

*squared log of real GDP per capita*
gen lngdp_pc_real_sq = (lngdp_pc_real)^2

*city administrative level and its dummies*
*later assignments override earlier ones, so the order of the replace statements matters*
*level 4: default for every city not matched below*
gen level = 4
*level 3: prefecture-level capital cities, whose city_id ends in "0100" (temp is city_id from the third character onwards)*
replace level = 3 if temp == "0100"
*level 2: the fifteen cities named here; the names are split over three inlist() calls because inlist() accepts only a limited number of string arguments*
replace level = 2 if inlist(city_name, "Dalian", "Qingdao", "Ningbo", "Shenzhen", "Xiamen") ///
    | inlist(city_name, "Shenyang", "Changchun", "Harbin", "Nanjing", "Hangzhou") ///
    | inlist(city_name, "Jinan", "Wuhan", "Guangzhou", "Chengdu", "Xian")
*level 1: the four municipalities*
replace level = 1 if inlist(city_name, "Beijing", "Chongqing", "Tianjin", "Shanghai")
drop temp
*one indicator variable per value of level (level1, level2, ...)*
tabulate level, gen(level)

*regional dummies*
*temp is the first two characters of city_id, which are unique to each province, converted to a number*
gen temp = substr(city_id, 1, 2)
destring temp, replace
*region 0 is the default and stands for Southwest: Guangxi, Sichuan, Chongqing, Guizhou, Yunnan, Tibet*
gen region = 0
*North: Beijing, Tianjin, Hebei, Shanxi, Shandong*
replace region = 1 if inlist(temp, 11, 12, 13, 14, 37)
*East: Shanghai, Jiangsu, Zhejiang*
replace region = 2 if inlist(temp, 31, 32, 33)
*South: Fujian, Guangdong, Hainan*
replace region = 3 if inlist(temp, 35, 44, 46)
*Middle: Anhui, Jiangxi, Henan, Hubei, Hunan*
replace region = 4 if inlist(temp, 34, 36, 41, 42, 43)
*Northeast: Liaoning, Jilin, Heilongjiang*
replace region = 5 if inlist(temp, 21, 22, 23)
*Northwest: Inner Mongolia, Shaanxi, Gansu, Qinghai, Ningxia, Xinjiang*
replace region = 6 if inlist(temp, 15, 61, 62, 63, 64, 65)
*one indicator variable per value of region (region1, region2, ...)*
tabulate region, gen(region)
drop temp

*variable labels: variables in levels*
label variable so2_emi "industrial SO2 emission"
label variable ind_val_real "total industrial output"
label variable gdp_pc_real "per capita GDP"
label variable kl_ratio_real "capital to labor ratio"
*variable labels: ownership shares*
label variable ind_val_hmt_per "HMT share"
label variable ind_val_fie_per "Foreign share"
*variable labels: log-transformed variables*
label variable lnso2_emi "Log industrial SO2 emission"
label variable lnind_val_real "Log total industrial output"
label variable lngdp_pc_real "Log per capita GDP"
label variable lngdp_pc_real_sq "Log per capita GDP square"
label variable lnkl_ratio_real "Log capital to labor ratio"

*write the final dataset, overwriting any existing data_final.dta; it is ready to generate results*
saveold "data_final.dta", replace


